| Jobs | 1 |
| Input files |
- /2/scratch/keaton/plague-phylogeography/results/parse_tree/{reads_origin}/{locus_name}_filter{missing_data}/parse_tree.nwk
- /2/scratch/keaton/plague-phylogeography/results/parse_tree/{reads_origin}/{locus_name}_filter{missing_data}/parse_tree.tsv
|
| Output files |
- /2/scratch/keaton/plague-phylogeography/results/clock/{reads_origin}/{locus_name}_filter{missing_data}/clock_model_timetree.nwk
- /2/scratch/keaton/plague-phylogeography/results/clock/{reads_origin}/{locus_name}_filter{missing_data}/clock_model.tsv
- /2/scratch/keaton/plague-phylogeography/results/clock/{reads_origin}/{locus_name}_filter{missing_data}/clock_model_skyline.svg
|
| Code |
| # Objectives
1. Estimate Clock Model
1. Filter Outliers
- Plot Prune Compare: Divtree
- Plot Prune Compare: Timetree
1. Add Clock Model to Dataframe
NOTE: Should outliers be filtered only after the clock model has been estimated?
|
|
|
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 | # Bio
import treetime
from Bio import Phylo
# Plotting
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns
sns.set_style('whitegrid')
# Stats
import pandas as pd
# System IO
import dill
import copy
import os
import io
import sys
|
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
from config import *
# Identifiers for this pipeline step ("clock") and the step whose outputs it consumes.
SCRIPT_NAME = "clock"
PREV_DIR_NAME = "parse_tree"
PREV_SCRIPT_NAME = "parse_tree"
# When executed by snakemake, wildcards and the working directory come from the
# workflow; otherwise (interactive notebook use) fall back to hard-coded
# defaults and walk two directories up to reach the project root.
try:
    WILDCARDS = snakemake.wildcards
    project_dir = os.getcwd()
except NameError:
    WILDCARDS = ["all", "chromosome", "50"]
    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
# The three wildcards arrive in the same order in both cases above.
READS_ORIGIN, LOCUS_NAME, MISSING_DATA = WILDCARDS[0], WILDCARDS[1], WILDCARDS[2]
# Column in the metadata table that holds sample names.
NAME_COL = "Name"
|
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# ------------------------------------------------------------------
# Input/output paths for this step.
# ------------------------------------------------------------------
config_dir = os.path.join(project_dir, "config")
results_dir = os.path.join(project_dir, "results")
# Every per-run directory shares the same "<locus>_filter<missing>" suffix;
# compute it once instead of re-formatting it on every path below.
filter_dir = "{}_filter{}".format(LOCUS_NAME, MISSING_DATA)
outdir = os.path.join(results_dir, SCRIPT_NAME, READS_ORIGIN, filter_dir)
# Inputs produced by the previous (parse_tree) step.
tree_dill = os.path.join(results_dir, PREV_DIR_NAME, READS_ORIGIN, filter_dir, PREV_SCRIPT_NAME + ".phylo.obj")
tree_df_dill = os.path.join(results_dir, PREV_DIR_NAME, READS_ORIGIN, filter_dir, PREV_SCRIPT_NAME + ".df.obj")
tree_df_path = os.path.join(results_dir, PREV_DIR_NAME, READS_ORIGIN, filter_dir, PREV_SCRIPT_NAME + ".tsv")
# Multiple-sequence alignment from the snippy step.
aln_path = os.path.join(results_dir, "snippy_multi", READS_ORIGIN, "snippy-core_{}.snps.filter{}.aln".format(LOCUS_NAME, MISSING_DATA))
# Auspice visualization support files.
auspice_latlon_path = os.path.join(results_dir, "parse_tree", READS_ORIGIN, filter_dir, "parse_tree" + "_latlon.tsv")
auspice_colors_path = os.path.join(results_dir, "parse_tree", READS_ORIGIN, filter_dir, "parse_tree" + "_colors.tsv")
auspice_config_path = os.path.join(config_dir, "auspice_config.json")
auspice_remote_dir_path = os.path.join(project_dir, "auspice/")
print("tree_dill:\t", tree_dill)
print("tree_df_dill:\t", tree_df_dill)
print("aln path:\t", aln_path)
print("auspice_latlon_path:", auspice_latlon_path)
print("auspice_colors_path:", auspice_colors_path)
print("auspice_config_path:", auspice_config_path)
print("auspice_remote_dir_path:", auspice_remote_dir_path)
print("outdir:", outdir)
# Create the output directory if it doesn't exist. The original used a
# 'while not os.path.exists(...)' loop, which adds nothing over a single call
# and could spin if creation repeatedly failed; makedirs with exist_ok=True
# is idempotent and race-safe.
os.makedirs(outdir, exist_ok=True)
# Downstream output files are named after the model rather than the step.
SCRIPT_NAME = "clock_model"
|
|
|
|
# Load the Bio.Phylo tree pickled by the parse_tree step, then ladderize it
# (reverse=False) for consistent drawing order downstream.
with open(tree_dill, "rb") as handle:
    tree = dill.load(handle)
tree.ladderize(reverse=False)
|
|
|
|
# Load the per-node metadata dataframe pickled by the parse_tree step.
# The bare trailing expression displays the frame when run in a notebook.
with open(tree_df_dill, "rb") as handle:
    tree_df = dill.load(handle)
tree_df
|
|
|
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Parse sampling dates from the metadata table with treetime's helper.
dates_raw = treetime.utils.parse_dates(tree_df_path,
                                       date_col=DATE_COL,
                                       name_col=NAME_COL)
# Keep list-valued entries (date ranges) untouched and drop NaN entries
# (internal nodes carry no sampling date). The list check must come first:
# pd.isnull on a list does not return a scalar.
dates = {name: value for name, value in dates_raw.items()
         if isinstance(value, list) or not pd.isnull(value)}
# The reference genome gets its known date.
dates["Reference"] = REF_DATE
|
|
| ---
# 1. Estimate Clock Model
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Estimate the molecular clock with TreeTime, capturing its verbose stdout to
# a log file instead of the console.
print("Estimating clock model...")
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
try:
    # branch_length_mode:input --> tMRCA ~20,000 YBP
    tt = treetime.TreeTime(dates=dates,
                           aln=aln_path,
                           tree=tree,
                           verbose=4,
                           fill_overhangs=False,
                           seq_len=REF_LEN,
                           )
    tt.run(
        # Stable Parameters
        max_iter=3,
        n_iqd=3,            # tips >3 interquartile distances from the clock regression are outliers
        verbose=4,
        infer_gtr=True,
        use_covariation=False,
        root=tt.tree.root,  # keep the root assigned by the previous step
        resolve_polytomies=True,
        vary_rate=True,
        relaxed_clock={"slack": 0.1, "coupling": 0},
        # Variable Parameters
        branch_length_mode="joint",  # joint or marginal
        time_marginal=True,          # True, False or "assign"
        Tc="skyline",
    )
    tt.tree.ladderize(reverse=False)
finally:
    # Restore stdout even if TreeTime raises; the original only restored it on
    # the success path, which would silence every later cell after a failure.
    output = new_stdout.getvalue()
    sys.stdout = old_stdout
# Save the captured log to file.
out_path = os.path.join(outdir, SCRIPT_NAME + "_estimate.log")
with open(out_path, "w") as file:
    file.write(output)
print("Standard output restored, logging to file disabled.")
|
|
|
|
# Common Ancestor
# Look up the clade named "NODE0" — presumably the root label assigned by the
# parse_tree step (TODO confirm node naming). The bare expression displays the
# clade when run in a notebook; nothing is stored.
tt.tree.common_ancestor("NODE0")
|
|
# Root-to-tip regression plot before outlier filtering, internal nodes included.
tt.plot_root_to_tip(add_internal=True, label=True)
# Save the figure alongside the other clock outputs.
out_path = os.path.join(outdir, SCRIPT_NAME + "_rtt-pre." + FMT)
plt.savefig(out_path, dpi=dpi, bbox_inches="tight")
|
|
|
|
| ## Prune 'Bad' Branches from Tree
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# Work on copies: tt_pre keeps outliers (colored red for display), tt_prune
# has them removed.
tt_pre = copy.deepcopy(tt)
tt_prune = copy.deepcopy(tt)
# Convert both trees to divergence units; flag bad branches in the display copy.
for n in tt_pre.tree.find_clades():
    n.branch_length = n.mutation_length
    if n.bad_branch:
        n.color = "red"
for n in tt_prune.tree.find_clades():
    n.branch_length = n.mutation_length
# Iteratively prune bad terminal branches. Only terminals are ever pruned, so
# the loop condition must also be restricted to terminals: the original
# counted *all* bad clades and would loop forever if a bad internal node
# remained after every bad tip had been removed.
bad_tips = [c for c in tt_prune.tree.get_terminals() if c.bad_branch]
while len(bad_tips) > 0:
    for node in bad_tips:
        print("PRUNING:", node.name)
        tt_prune.tree.prune(node)
    # Pruning collapses unifurcations and changes topology, so re-scan.
    bad_tips = [c for c in tt_prune.tree.get_terminals() if c.bad_branch]
|
|
| ## Plot Prune Comparison: Divtree
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Side-by-side divergence trees before and after pruning bad branches.
sns.set_style("ticks")
fig, (ax1, ax2) = plt.subplots(2, sharex=True, dpi=dpi)
Phylo.draw(tt_pre.tree, show_confidence=False, label_func=lambda x: '', do_show=False, axes=ax1)
Phylo.draw(tt_prune.tree, show_confidence=False, label_func=lambda x: '', do_show=False, axes=ax2)
ax1.set_title("Divergence Tree Pre-Pruning")
ax2.set_title("Divergence Tree Post-Pruning")
# Only the bottom panel gets an x label (the panels share the x axis).
ax1.set_xlabel("")
ax2.set_xlabel("Branch Length")
# Shared cosmetic cleanup; the original repeated these statements per axis and
# duplicated one ax2.set_ylabel("") call.
for ax in (ax1, ax2):
    ax.set_ylabel("")
    ax.set_yticks([])
    for side in ("right", "top", "left"):
        ax.spines[side].set_visible(False)
# Save
out_path = os.path.join(outdir, SCRIPT_NAME + "_divtree-prune." + FMT)
plt.savefig(out_path, dpi=dpi, bbox_inches="tight")
|
|
| ## Plot Prune Comparison: Timetree
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Time-scaled tree plots before and after pruning. When marginal time
# inference attached a posterior CDF to the root, confidence intervals can be
# requested; check once instead of repeating the hasattr test per plot.
has_marginal = hasattr(tt.tree.root, "marginal_inverse_cdf")
# Pre-pruning
if has_marginal:
    fig, ax1 = treetime.treetime.plot_vs_years(
        tt_pre, label_func=lambda x: "", do_show=False,
        show_confidence=False, confidence=(1 - CONFIDENCE, CONFIDENCE))
else:
    fig, ax1 = treetime.treetime.plot_vs_years(
        tt_pre, label_func=lambda x: "", do_show=False, show_confidence=False)
ax1.set_title("Time Tree Pre-Pruning")
out_path = os.path.join(outdir, SCRIPT_NAME + "_timetree-prune-pre." + FMT)
plt.savefig(out_path, dpi=dpi, bbox_inches="tight")
# Post-pruning
if has_marginal:
    fig, ax2 = treetime.treetime.plot_vs_years(
        tt_prune, label_func=lambda x: "", do_show=False,
        show_confidence=False, confidence=(1 - CONFIDENCE, CONFIDENCE))
else:
    fig, ax2 = treetime.treetime.plot_vs_years(
        tt_prune, label_func=lambda x: "", do_show=False, show_confidence=False)
ax2.set_title("Time Tree Post-Pruning")
out_path = os.path.join(outdir, SCRIPT_NAME + "_timetree-prune-post." + FMT)
plt.savefig(out_path, dpi=dpi, bbox_inches="tight")
|
|
|
|
|
|
| ## Remove Collapsed Nodes from Dataframe
|
|
# Drop dataframe rows whose nodes no longer exist in the tree (e.g. removed
# when TreeTime resolved/collapsed polytomies). The original dropped rows from
# tree_df *while* iterating it with iterrows() — mutating a frame during
# iteration is unsafe — and performed an O(n) list-membership test per row.
# Collect the obsolete names first (set lookup is O(1)), then drop once.
tt_nodes = {c.name for c in tt.tree.find_clades()}
obsolete = [name for name in tree_df.index if name not in tt_nodes]
tree_df.drop(obsolete, inplace=True)
tree_df
|
|
| ---
# 3. Add Clock Model to Dataframe
- Rates
- Dates
- RTT Regression
- Skyline
- coord_x and coord_y
|
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Per-branch substitution rates. With a relaxed clock every branch scales the
# mean (regression-slope) rate by its gamma factor; under a strict clock every
# branch uses the mean rate (fold change 1).
n_rows = len(tree_df)
tree_df["timetree_rate"] = [NO_DATA_CHAR] * n_rows
tree_df["timetree_rate_fold_change"] = [NO_DATA_CHAR] * n_rows
tree_df["timetree_mutation_length"] = [NO_DATA_CHAR] * n_rows
# The mean rate is the slope of the root-to-tip regression.
mean_rate = tt.clock_model["slope"]
for node in tt.tree.find_clades():
    tree_df.at[node.name, "timetree_mutation_length"] = node.mutation_length
    interp = getattr(node, "branch_length_interpolator", None)
    if interp:
        # Relaxed clock: per-branch rate multiplier.
        fold = interp.gamma
    else:
        # Strict clock.
        fold = 1
    tree_df.at[node.name, "timetree_rate_fold_change"] = fold
    tree_df.at[node.name, "timetree_rate"] = mean_rate * fold
|
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# Add inferred node dates (and, when marginal inference ran, confidence
# intervals) to the dataframe.
# Create new columns
tree_df["timetree_raw_date"] = [NO_DATA_CHAR for row in range(0,len(tree_df))]
tree_df["timetree_num_date"] = [NO_DATA_CHAR for row in range(0,len(tree_df))]
# Confidence columns only exist when marginal time inference produced
# posterior CDFs on the root; each cell holds a two-element [lower, upper] list.
if hasattr(tt.tree.root, "marginal_inverse_cdf"):
    tree_df["timetree_num_date_confidence"] = [[NO_DATA_CHAR,NO_DATA_CHAR] for row in range(0,len(tree_df))]
    tree_df["timetree_num_date_bar"] = [[NO_DATA_CHAR,NO_DATA_CHAR] for row in range(0,len(tree_df))]
# clock_length is the same as branch_length until running branch_length_to_years()
tree_df["timetree_clock_length"] = [NO_DATA_CHAR for row in range(0,len(tree_df))]
# Make a copy to change branch_length (to years) without mutating tt.
tt_copy = copy.deepcopy(tt)
tt_copy.branch_length_to_years()
# Confidence ranges wider than this many years are treated as failed estimates.
CONF_BAD_RANGE = 10000
for c in tt_copy.tree.find_clades():
    # Marginal Probability
    if hasattr(c, "marginal_inverse_cdf"):
        # Retrieve the region containing the confidence interval.
        # NOTE(review): the clade from tt_copy is passed to tt's method —
        # presumably fine since the node attributes carry the posterior; confirm.
        conf = tt.get_max_posterior_region(c, fraction=CONFIDENCE)
        conf_range = abs(conf[1] - conf[0])
        if conf_range > CONF_BAD_RANGE:
            # Collapse absurdly wide intervals to the point estimate.
            print("Bad lower date estimated for:", c.name,"\t", conf_range, "\tSetting to", c.numdate)
            conf[0] = c.numdate
            conf[1] = c.numdate
        # Set as lower and upper bounds on date (mutates the per-cell list in place).
        tree_df.at[c.name, "timetree_num_date_confidence"][0] = conf[0]
        tree_df.at[c.name, "timetree_num_date_confidence"][1] = conf[1]
        # Convert to years before present for drawing bars.
        tree_df.at[c.name, "timetree_num_date_bar"][0] = CURRENT_YEAR - conf[0]
        tree_df.at[c.name, "timetree_num_date_bar"][1] = CURRENT_YEAR - conf[1]
    # Recorded for every clade, with or without marginal confidence
    # (indentation reconstructed from the flattened export — TODO confirm).
    tree_df.at[c.name, "timetree_raw_date"] = c.date
    tree_df.at[c.name, "timetree_num_date"] = c.numdate
    tree_df.at[c.name, "timetree_clock_length"] = c.branch_length
|
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Record plotting coordinates for the time tree and the root-to-tip regression
# in the dataframe, so figures can be rebuilt without re-running TreeTime.
# make a copy of the tree (branch lengths become years; tt stays untouched)
tt_copy = copy.deepcopy(tt)
tt_copy.branch_length_to_years()
# Plotting the tree
tree_df["timetree_coord_x"] = [NO_DATA_CHAR for row in range(0,len(tree_df))]
tree_df["timetree_coord_y"] = [NO_DATA_CHAR for row in range(0,len(tree_df))]
# Plotting the regression
tree_df["timetree_reg_x"] = [NO_DATA_CHAR for row in range(0,len(tree_df))]
tree_df["timetree_reg_y"] = [NO_DATA_CHAR for row in range(0,len(tree_df))]
tree_df["timetree_reg_bad"] = [NO_DATA_CHAR for row in range(0,len(tree_df))]
# Layout helpers (from config import *) map each clade to a drawing position.
x_posns = get_x_positions(tt_copy.tree)
y_posns = get_y_positions(tt_copy.tree)
tt_reg = tt_copy.setup_TreeRegression()
# Add x and y coordinates
for c in tt_copy.tree.find_clades():
    # Tree Node Coordinates (matched by name since the dicts key on clade objects)
    coord_x = [value for key,value in x_posns.items() if key.name == c.name][0]
    coord_y = [value for key,value in y_posns.items() if key.name == c.name][0]
    tree_df.at[c.name, 'timetree_coord_x'] = coord_x
    tree_df.at[c.name, 'timetree_coord_y'] = coord_y
    # Regression Node Coordinates
    # NOTE(review): relies on the private attribute c._v for the divergence
    # axis — presumably set by setup_TreeRegression; confirm against treetime.
    reg_y = c._v
    if c.is_terminal():
        reg_x = tt_reg.tip_value(c)
    else:
        reg_x = c.numdate
    reg_bad = c.bad_branch if hasattr(c, 'bad_branch') else False
    tree_df.at[c.name, 'timetree_reg_x'] = reg_x
    tree_df.at[c.name, 'timetree_reg_y'] = reg_y
    tree_df.at[c.name, 'timetree_reg_bad'] = reg_bad
# Fix up new values that could be none
tree_df.fillna(NO_DATA_CHAR, inplace=True)
tree_df
|
|
| ## Divergence coord_x and coord_y
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# Divergence-scaled plotting coordinates: convert a copy of the tree back to
# mutation units, then record each node's drawing position in the dataframe.
tt_copy = copy.deepcopy(tt)
for node in tt_copy.tree.find_clades():
    node.branch_length = node.mutation_length
x_posns = get_x_positions(tt_copy.tree)
y_posns = get_y_positions(tt_copy.tree)
# Add x and y coordinates as other attributes
for node in tt_copy.tree.find_clades():
    # x is in branch-length (divergence) units; y is the arbitrary layout
    # position derived from the number of tips. The position dicts key on
    # clade objects, so match by name.
    coord_x = [value for key, value in x_posns.items() if key.name == node.name][0]
    coord_y = [value for key, value in y_posns.items() if key.name == node.name][0]
    tree_df.at[node.name, 'coord_x'] = coord_x
    tree_df.at[node.name, 'coord_y'] = coord_y
# Visualize dataframe
tree_df
|
|
|
|
# Coalescent skyline: only present when Tc="skyline" attached a merger model.
if hasattr(tt, "merger_model"):
    out_path_skyline_pdf = os.path.join(outdir, SCRIPT_NAME + "_skyline." + FMT )
    out_path_skyline_txt = os.path.join(outdir, SCRIPT_NAME + "_skyline.tsv" )
    # Plot, save to TSV, and echo to screen in a single wrapper call
    # (n_std is the half-width of the confidence band in standard deviations).
    treetime.wrappers.print_save_plot_skyline(
        tt,
        plot=out_path_skyline_pdf,
        save=out_path_skyline_txt,
        screen=True,
        n_std=2.0,
    )
|
|
| ## Add Metadata as Comments
|
|
# Attach each node's dataframe metadata to the tree as node comments
# (project helper brought in via `from config import *`; exact comment
# format is defined there — confirm in config before relying on it).
metadata_to_comment(tt.tree, tree_df)
|
|
|
|
|
|
# Persist the annotated dataframe: TSV for downstream tools, dill pickle for
# later notebook steps.
out_path_df = os.path.join(outdir, SCRIPT_NAME + ".tsv" )
out_path_pickle_df = os.path.join(outdir, SCRIPT_NAME + ".df.obj" )
tree_df.to_csv(out_path_df, sep="\t")
with open(out_path_pickle_df, "wb") as handle:
    dill.dump(tree_df, handle)
|
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Export the time-scaled tree in three formats: nexus (keeps node comments),
# dill (full TreeTime object), and newick (comments stripped).
tt_copy = copy.deepcopy(tt)
branch_fmt = '%1.{}f'.format(BRANCH_LEN_SIG_DIG)
# Nexus
out_path_nexus = os.path.join(outdir, SCRIPT_NAME + "_timetree.nexus" )
Phylo.write(tt_copy.tree, out_path_nexus, 'nexus', format_branch_length=branch_fmt)
# Dill object
out_path_dill_tree = os.path.join(outdir, SCRIPT_NAME + "_timetree.treetime.obj" )
with open(out_path_dill_tree, "wb") as handle:
    dill.dump(tt_copy, handle)
# Newick (remove comments first)
for clade in tt_copy.tree.find_clades():
    clade.comment = None
out_path_nwk = os.path.join(outdir, SCRIPT_NAME + "_timetree.nwk" )
Phylo.write(tt_copy.tree, out_path_nwk, 'newick', format_branch_length=branch_fmt)
|
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Export the divergence tree in the same three formats as the time tree,
# after converting branch lengths back to mutation units.
tt_copy = copy.deepcopy(tt)
for node in tt_copy.tree.find_clades():
    node.branch_length = node.mutation_length
branch_fmt = '%1.{}f'.format(BRANCH_LEN_SIG_DIG)
# Nexus
out_path_nexus = os.path.join(outdir, SCRIPT_NAME + "_divtree.nexus" )
Phylo.write(tt_copy.tree, out_path_nexus, 'nexus', format_branch_length=branch_fmt)
# Dill object
out_path_dill_tree = os.path.join(outdir, SCRIPT_NAME + "_divtree.treetime.obj" )
with open(out_path_dill_tree, "wb") as handle:
    dill.dump(tt_copy, handle)
# Newick (remove comments first)
for clade in tt_copy.tree.find_clades():
    clade.comment = None
out_path_nwk = os.path.join(outdir, SCRIPT_NAME + "_divtree.nwk" )
Phylo.write(tt_copy.tree, out_path_nwk, 'newick', format_branch_length=branch_fmt)
|
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Export tree + alignment + per-node metadata to an augur-style node-data
# dict (augur_export is a project helper from config; semantics live there).
augur_dict = augur_export(
    tree_path=out_path_nwk,
    aln_path=aln_path,
    tree=tt.tree,
    tree_df=tree_df,
    # Dataframe columns matching these keywords are excluded — presumably
    # from the exported color/metadata fields; confirm in augur_export.
    color_keyword_exclude=["color", "coord", "reg", "lat", "lon"],
    # Force branch numbers to strings so JSON round-trips them stably.
    type_convert = {
        "Branch_Number" : (lambda x : str(x))
    },
)
# Spot-check one known node made it into the export.
print(augur_dict["nodes"]["Reference"])
# Attach the inferred skyline (x, lower, y, upper as plain floats) when present.
if hasattr(tt, "merger_model"):
    skyline, conf = tt.merger_model.skyline_inferred(gen=50, confidence=2)
    augur_dict['skyline'] = [[float(x) for x in skyline.x], [float(y) for y in conf[0]],
                             [float(y) for y in skyline.y], [float(y) for y in conf[1]]]
out_path_augur_json = os.path.join(outdir, SCRIPT_NAME + "_augur.json" )
utils.write_json(data=augur_dict, file_name=out_path_augur_json, indent=JSON_INDENT)
|
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# Build the final auspice v2 JSON from the augur node data plus the display
# config, color, and lat/lon files (auspice_export is a project helper from
# config; see its definition for the merge semantics).
auspice_dict = auspice_export(
    tree=tt.tree,
    augur_json_paths=[out_path_augur_json],
    auspice_config_path=auspice_config_path,
    auspice_colors_path=auspice_colors_path,
    auspice_latlons_path=auspice_latlon_path,
)
# Write outputs - For Local Rendering
out_path_auspice_local_json = os.path.join(outdir, SCRIPT_NAME + "_auspice.json" )
utils.write_json(data=auspice_dict, file_name=out_path_auspice_local_json, indent=JSON_INDENT, include_version=False)
# Validate against the auspice schema before declaring success (raises on failure).
export_v2.validate_data_json(out_path_auspice_local_json)
print("Validation successful for local JSON.")
# Write outputs - For Remote Rendering (underscores become dashes in the name)
out_path_auspice_remote_json = os.path.join(auspice_remote_dir_path, AUSPICE_PREFIX + SCRIPT_NAME.replace("_","-") + ".json" )
utils.write_json(data=auspice_dict, file_name=out_path_auspice_remote_json, indent=JSON_INDENT, include_version=False)
export_v2.validate_data_json(out_path_auspice_remote_json)
print("Validation successful for remote JSON.")
|
|
|
|